{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bdc99287-5f4e-452f-b9dc-6db9c179b230",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "92dbe2a7-63a6-480e-b8c0-2a0b1a122c85",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = [\"The first sentence, which is the longest one in the list.\",\n",
    "            \"The second sentence is not that long.\",\n",
    "            \"A very short sentence.\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "161d626d-39e0-4ff1-ac2d-60684a411f60",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "23bf4a2d-16bd-45da-a440-f6daf2c083d5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input_ids': [[101, 1109, 1148, 5650, 117, 1134, 1110, 1103, 6119, 1141, 1107, 1103, 2190, 119, 102], [101, 1109, 1248, 5650, 1110, 1136, 1115, 1263, 119, 102], [101, 138, 1304, 1603, 5650, 119, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}\n"
     ]
    }
   ],
   "source": [
    "tokenized_input = tokenizer(sentences)\n",
    "print(tokenized_input)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e1a8586d-9480-4c28-8c41-639c0bbe08f7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['[CLS]', 'The', 'first', 'sentence', ',', 'which', 'is', 'the', 'longest', 'one', 'in', 'the', 'list', '.', '[SEP]']\n"
     ]
    }
   ],
   "source": [
    "tokens = tokenizer.convert_ids_to_tokens(tokenized_input[\"input_ids\"][0])\n",
    "print(tokens)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
